# Refuse to run from an unexpected directory, then load the raw data set.
# `working_folder` has to be defined before this script is executed.
if (!(getwd() == working_folder)) {
  stop("Wrong folder!")
}

# The file is looked up relative to the current working directory.
full_data <- read.csv(file = "data_11.csv", row.names = NULL)

# First criterion: flag for exclusion
#   * class 10 from school 1 (no data on the students' social network),
#   * students in youth-adult education,
#   * anyone older than 20 years.
# `flag` is 1 for rows to exclude and 0 otherwise.

# Start from an all-zero flag and mark matches through which(), like the
# other criteria do. which() ignores NA comparisons, so a missing school or
# class leaves the flag at 0 instead of NA; an NA flag would turn every later
# sum(flag) for that class into NA.
# NOTE(review): this reads `class`, while the class-level summary further
# down groups by `class_id` -- confirm which column is intended here.
full_data$flag <- numeric(nrow(full_data))
full_data$flag[which(full_data$school_id == 1 & full_data$class == 10)] <- 1

first_criteria <- which(full_data$young_adult_educ == 1 | full_data$age > 20)
full_data$flag[first_criteria] <- 1

# Quick check of how many rows are flagged so far.
table(full_data$flag)

# Second criterion: final status in the administrative records.
# School-situation codes to exclude: failed by absence, abandon, transfer.
criteria_situation <- c(3, 4, 5)

# Descriptive check (printed only, nothing is stored): for each school
# situation, split by whether the interview id is missing, show the number of
# observations, mean grades, share of missing grades, and mean of
# high_school and age.
full_data %>%
  group_by(school_situation, is.na(id_interview)) %>%
  summarise("Obs" = n(),
            "LP" = mean(LP_grade, na.rm = TRUE),
            "Math" = mean(MAT_grade, na.rm = TRUE),
            "NA LP" = mean(is.na(LP_grade)),
            "NA Math" = mean(is.na(MAT_grade)),
            "high_school" = mean(high_school, na.rm = TRUE),
            "Age" = mean(age, na.rm = TRUE))

# %in% never returns NA, so a missing school situation is simply not flagged.
second_criteria <- which(full_data$school_situation %in% criteria_situation)

full_data$flag[second_criteria] <- 1
table(full_data$flag)

# Third criterion: number of missing grades (counted without edfis).
# A student whose count of missing grades is anywhere from 1 to 11 is
# flagged, so only students with a complete set of grades are kept.
criteria_grades <- 1:11

third_criteria <- which(
  is.element(full_data$missing_wo_edfis, criteria_grades)
)
length(third_criteria)

full_data$flag[third_criteria] <- 1
table(full_data$flag)

# Fourth criterion: exclude whole classes with too many missing students,
# evaluated AFTER the student-level criteria above have been applied.

# valid_non_missing is 1 when the student has an interview id AND has not
# been flagged by an earlier criterion, 0 otherwise. which() drops NA
# comparisons, so a row with an NA flag stays at 0.
full_data$valid_non_missing <- 0
full_data$valid_non_missing[which(!is.na(full_data$id_interview) &
                                    full_data$flag == 0)] <- 1

# Sanity checks: valid_non_missing against interview availability and flag.
table(full_data$valid_non_missing, is.na(full_data$id_interview),
      useNA = "always")

table(full_data$valid_non_missing, full_data$flag, useNA = "always")

# One row per (school_id, class_id). Column order matters to any code that
# indexes this table by position:
#   Total                   - students in the class
#   Valid                   - students not flagged by the earlier criteria
#   Non-missing             - students with an interview id (flagged or not)
#   Share Valid Non-Missing - among the valid students, the share that also
#                             has an interview id
# The share uses valid_non_missing in the numerator so that numerator and
# denominator refer to the same (unflagged) students; counting every
# interviewed student over the unflagged ones could exceed 1.
# ungroup() so the result does not stay grouped by school_id.
overall_classes <- group_by(full_data, school_id, class_id) %>%
  summarise("Total" = n(),
            "Valid" = n() - sum(flag),
            "Non-missing" = sum(!is.na(id_interview)),
            "Share Valid Non-Missing" = sum(valid_non_missing) /
              (n() - sum(flag))) %>%
  ungroup()

# Thresholds for the class-level criterion.
criteria_forth_share <- .70 # Keep only those in classes with at least this
                            # ratio of non-missing students
criteria_forth_size  <- 10  # Keep only those in classes with at least this
                            # number of valid responses.

# Rows of overall_classes that fail either threshold. Columns are addressed
# by name rather than by position so that reordering the summary cannot
# silently change which column is tested. which() drops NA comparisons.
bad_class <-
  which(overall_classes[["Valid"]] < criteria_forth_size |
          overall_classes[["Share Valid Non-Missing"]] < criteria_forth_share)

# Mark every student that belongs to one of the bad classes.
# Iterating over bad_class itself (instead of 1:length(bad_class)) makes the
# loop a no-op when no class is bad; the 1:0 sequence would otherwise run
# with indices 1 and 0 and fail on the empty lookup.
full_data$flag_class <- 0
for (bad_row in bad_class) {
  to_replace <- which(
    full_data$school_id == overall_classes$school_id[bad_row] &
      full_data$class_id == overall_classes$class_id[bad_row]
  )

  full_data$flag_class[to_replace] <- 1
}
table(full_data$flag_class, full_data$flag)

# Collect every student excluded by a student-level flag or by a class-level
# flag. The stored values are row positions in full_data (the output of
# which()), not values of an id column.
students_id_to_delete <- with(
  full_data,
  which(flag == 1 | flag_class == 1)
)
length(students_id_to_delete)

# Written with saveRDS(), so it has to be read back with readRDS() even
# though the file name ends in ".RData".
saveRDS(
  object = students_id_to_delete,
  file = "intermediary_outputs/students_id_to_delete.RData"
)

